The description is at this link - https://www.kaggle.com/c/house-prices-advanced-regression-techniques
Loading required libraries into R workspace
library(knitr)
library(DataExplorer) #For data exploration
library(DescTools)
library(psych) # describe()
library(plyr)
library(rowr)
library(corrplot) # For graphical correlations
library(caret)
library(Hmisc) # For correlation matrix
library(rlist)
library(randomForest)
library(e1071) # tune() for random forestsReading the csv’s as dataframes into R.
setwd("/Users/chetanak/Box Sync/Projects/DataScience/R/house-prices-advanced-regression-techniques/")
train <- read.csv("train.csv", stringsAsFactors = F)
test <- read.csv("test.csv", stringsAsFactors = F)The test and train datasets have 1460 and 1459 rows each respectively. The test and train datasets have 81 and 80 columns each respectively. A difference in the columns variables across two data sets reveal absence of ‘SalePrice’ predictor variable in test data. An ‘ID’ column variable is present in each of the datasets. However, this column is not essential in prediction rather this column in test data set is required for submission. Hence, saving the ‘test$ID’ values to a vector ‘test_ids’ and eliminating the ‘ID’ column in both the datasets. We now bind the datasets for exploratory analysis. After binding, the combined data set contains 2919 and 80 columns. Of these 80, 79 are predictor variables and one is a response variable, which is ‘SalePrice’
dim(train)
[1] 1460 81
dim(test)
[1] 1459 80
setdiff(names(train), names(test))
[1] "SalePrice"
test_ids <- test$Id
train$Id <- NULL
test$Id <- NULL
test$SalePrice <- NA
df <- rbind(train, test)
dim(df)
[1] 2919 80
str(df, list.len=10)
'data.frame': 2919 obs. of 80 variables:
$ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
$ MSZoning : chr "RL" "RL" "RL" "RL" ...
$ LotFrontage : int 65 80 68 60 84 85 75 NA 51 50 ...
$ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
$ Street : chr "Pave" "Pave" "Pave" "Pave" ...
$ Alley : chr NA NA NA NA ...
$ LotShape : chr "Reg" "Reg" "IR1" "IR1" ...
$ LandContour : chr "Lvl" "Lvl" "Lvl" "Lvl" ...
$ Utilities : chr "AllPub" "AllPub" "AllPub" "AllPub" ...
$ LotConfig : chr "Inside" "FR2" "Inside" "Corner" ...
[list output truncated]The best way to analyze each varaible is by grouping them under their respective category. After grouping the variabels I am going to,
Subsequent tabs above are going detail the steps taken to handle missing data, factorizing, encoding of the variables. Below is the variable grouping and the missing counts in each variable
garage_vars <- names(df[which(names(df) %like% "%Garage%")])
basement_vars <- names(df[which(names(df) %like% "%Bsmt%")])
pool_vars <- names(df[which(names(df) %like% "%Pool%")])
porch_vars <- names(df[which(names(df) %like% c("%Porch%", "%Deck%"))])
sale_vars <- names(df[which(names(df) %like% c("%Sale%", "%Sold"))])
lot_vars <- names(df[which(names(df) %like% c("%Lot%", "%Land%"))])
dwelling_vars <- names(df[which(names(df) %like% c("%SubClass%", "%Bldg%", "%HouseStyle%", "%Overall%", "%Year%"))])
exterior_vars <- names(df[which(names(df) %like% c("%Exter%", "%Roof%", "%MasVnr%", "%Foundation%", "%Street%", "%Alley%", "%PavedDrive%", "%Fence%"))])
utility_vars <- names(df[which(names(df) %like% c("%Heat%", "%Utilities%", "%Central%", "%Electrical%"))])
interior_vars <- names(df[which(names(df) %like% c("%Room%", "FullBath%", "HalfBath%", "%Kitchen%", "%Fire%", "%AbvGr%", "%Functional%", "%FlrSF%", "%LowQualFinSF%", "%GrLivArea%"))])
misc_vars <- names(df[which(names(df) %like% c("%Misc%"))])
zoning_vars <- names(df[which(names(df) %like% c("%Zoning%"))])
neighborhood_vars <- names(df[which(names(df) %like% c("%Neighborhood%", "Condition%"))])
vars_list = list(garage_vars, basement_vars, pool_vars, porch_vars, sale_vars, lot_vars,
dwelling_vars, exterior_vars, utility_vars, interior_vars, misc_vars,
zoning_vars, neighborhood_vars)
df_var <- as.data.frame(do.call(cbind.fill, c(vars_list, fill = NA)))
colnames(df_var) <- c("GarageVars", "BasementVars", "PoolVars", "PorchVars", "SaleVars", "LotVars",
"DwellingVars", "ExteriorVars", "UtilityVars", "InteriorVars", "MiscVars",
"ZomingVars", "NeighborhoodVars")
df_var#Show the missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA PoolQC MiscFeature Alley Fence SalePrice
2909 2814 2721 2348 1459
FireplaceQu LotFrontage GarageYrBlt GarageFinish GarageQual
1420 486 159 159 159
GarageCond GarageType BsmtCond BsmtExposure BsmtQual
159 157 82 82 81
BsmtFinType2 BsmtFinType1 MasVnrType MasVnrArea MSZoning
80 79 24 23 4
Utilities BsmtFullBath BsmtHalfBath Functional Exterior1st
2 2 2 2 1
Exterior2nd BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
1 1 1 1 1
Electrical KitchenQual GarageCars GarageArea SaleType
1 1 1 1 1
There are 7 Garage variables, out of which GarageYrBlt, GarageFinish, GarageQual, GarageCond have 159 NA’s, GarageType has 157 NA’s, GarageCars and Garage Area have 1 NA each.
df_NA[c(which(names(df_NA) %like% "Garage%"))]
GarageYrBlt GarageFinish GarageQual GarageCond GarageType
159 159 159 159 157
GarageCars GarageArea
1 1
cbind(sapply(df[garage_vars], class))
[,1]
GarageType "character"
GarageYrBlt "integer"
GarageFinish "character"
GarageCars "integer"
GarageArea "integer"
GarageQual "character"
GarageCond "character"First we need to check which of the 159 NA’s in the variables are common with 157 NA’s in GarageType
length(which(is.na(df$GarageYrBlt) & is.na(df$GarageFinish) & is.na(df$GarageQual) & is.na(df$GarageCond) & is.na(df$GarageType)))
## [1] 157There are 157 rows that are common across GarageYrBlt, GarageFinish, GarageQual, GarageCond, GarageType Let’s fix the values for these variables by selecting the rows based on GarageType NA’s since we know for sure that GarageType NA’s are in the 159 NA’s for other Garage variables. We assign the value ‘None’ to all NA’s based on the variable description in the previous tab.
df$GarageFinish[is.na(df$GarageType)] <- 'None'
df$GarageQual[is.na(df$GarageType)] <- 'None'
df$GarageCond[is.na(df$GarageType)] <- 'None'
df$GarageYrBlt[is.na(df$GarageType)] <- 0 # Integer Variable
df$GarageType[is.na(df$GarageType)] <- 'None'
#Show the missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "Garage%"))]
## GarageYrBlt GarageFinish GarageQual GarageCond GarageCars
## 2 2 2 2 1
## GarageArea
## 1Now we have 2 NA’s in each GarageYrBlt, GarageFinish, GarageQual, GarageCond variables and 1 NA each in GarageCars and GarageArea
kable(df[which(is.na(df$GarageYrBlt) | is.na(df$GarageFinish) | is.na(df$GarageQual) | is.na(df$GarageCond)), c("GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond", "GarageCars", "GarageArea")])| GarageYrBlt | GarageFinish | GarageQual | GarageCond | GarageCars | GarageArea | |
|---|---|---|---|---|---|---|
| 2127 | NA | NA | NA | NA | 1 | 360 |
| 2577 | NA | NA | NA | NA | NA | NA |
Is is evident that row 2127 has a Garage since the GarageArea is 360 and row 2577 has none. Fixing row 2577 values as below by assigning 0 value to integer variables and ‘None’ to character variables
df$GarageYrBlt[2577] <- 0
df$GarageFinish[2577] <- 'None'
df$GarageQual[2577] <- 'None'
df$GarageCond[2577] <- 'None'
df$GarageCars[2577] <- 0
df$GarageArea[2577] <- 0
df[2577, c("GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond", "GarageCars", "GarageArea")]Let us fix row 2127 for these variables. For GarageYrBlt variable, I will impute the YearBuild value. For other values I will impute with mode (the most frequent value) in the respective column
df$GarageYrBlt[2127] <- df$YearBuilt[2127]
df$GarageFinish[2127] <- names(sort(table(df$GarageFinish), decreasing=TRUE)[1])
df$GarageQual[2127] <- names(sort(table(df$GarageQual), decreasing=TRUE)[1])
df$GarageCond[2127] <- names(sort(table(df$GarageCond), decreasing=TRUE)[1])
df[2127, c("GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond", "GarageCars", "GarageArea")]
#Show the missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "Garage%"))]
## named numeric(0)Now, that all the NA’s are handled, I am going to evaluate the character variables. We have 4 character variables. Of these there are 3 ordered/ordinal variables (GarageFinish, GarageQual, GarageCond) and 1 unordered/multinomial (GarageType).
table(df$GarageType)
2Types Attchd Basment BuiltIn CarPort Detchd None
23 1723 36 186 15 779 157
table(df$GarageFinish)
Fin None RFn Unf
719 158 811 1231
table(df$GarageQual)
Ex Fa Gd None Po TA
3 124 24 158 5 2605
table(df$GarageCond)
Ex Fa Gd None Po TA
3 74 15 158 14 2655
I will convert GarageType to a factor since it does not have specific ordering to its values. I will assign numerical values to other character variables and convert them to Numeric type. I will decide later if I should treat ordinal variables as numeric or convert them to factors
df$GarageType <- factor(df$GarageType)
df$GarageFinish <- as.numeric(revalue(df$GarageFinish, c("None"=0, "Unf"=1, "RFn"=2, "Fin"=3)))
df$GarageQual <- as.numeric(revalue(df$GarageQual, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4, "Ex"=5)))
df$GarageCond <- as.numeric(revalue(df$GarageCond, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4, "Ex"=5)))
table(df$GarageType)
2Types Attchd Basment BuiltIn CarPort Detchd None
23 1723 36 186 15 779 157
table(df$GarageFinish)
0 1 2 3
158 1231 811 719
table(df$GarageQual)
0 1 2 3 4 5
158 5 124 2605 24 3
table(df$GarageCond)
0 1 2 3 4 5
158 14 74 2655 15 3
There are 11 Basement variables and all of them have NA’s.
df_NA[c(which(names(df_NA) %like% "%Bsmt%"))] BsmtCond BsmtExposure BsmtQual BsmtFinType2 BsmtFinType1
82 82 81 80 79
BsmtFullBath BsmtHalfBath BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
2 2 1 1 1
TotalBsmtSF
1
Among these variables I am going to figure out which of the variables BsmtCond, BsmtExposure, BsmtQual, BsmtFinType2 have commond NA row values with BsmtFinType1
length(which(is.na(df$BsmtCond) & is.na(df$BsmtExposure) & is.na(df$BsmtQual) & is.na(df$BsmtFinType2) & is.na(df$BsmtFinType1)))[1] 79
There are 79 rows that common across BsmtCond, BsmtExposure, BsmtQual, BsmtFinType2 and BsmtFinType1. Let’s fix the values for these variables by selecting the rows based on BsmtFinType1 NA’s since we know for sure that BsmtFinType1 NA’s are in the 82,81,80 NA’s for other Basement variables. We assign the value ‘None’ to all NA’s based on the variable description.
df$BsmtCond[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtExposure[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtQual[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtFinType2[is.na(df$BsmtFinType1)] <- 'None'
df$BsmtFinType1[is.na(df$BsmtFinType1)] <- 'None'
#Show the missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Bsmt%"))] BsmtCond BsmtExposure BsmtQual BsmtFullBath BsmtHalfBath
3 3 2 2 2
BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
1 1 1 1 1
Now we have 3 NA’s in each BsmtCond, BsmtExposure, 2 NA’s each in BsmtQual, BsmtFullBath BsmtHalfBath and 1 NA each in BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, and TotalBsmtSF
kable(df[which(is.na(df$BsmtCond) | is.na(df$BsmtExposure) | is.na(df$BsmtQual) | is.na(df$BsmtFullBath) | is.na(df$BsmtHalfBath) | is.na(df$BsmtFinSF1) | is.na(df$BsmtFinType2) | is.na(df$BsmtFinSF2) | is.na(df$BsmtUnfSF) | is.na(df$TotalBsmtSF)), c("BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFullBath", "BsmtHalfBath","BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF")])| BsmtCond | BsmtExposure | BsmtQual | BsmtFullBath | BsmtHalfBath | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | |
|---|---|---|---|---|---|---|---|---|---|---|
| 333 | TA | No | Gd | 1 | 0 | 1124 | NA | 479 | 1603 | 3206 |
| 949 | TA | NA | Gd | 0 | 0 | 0 | Unf | 0 | 936 | 936 |
| 1488 | TA | NA | Gd | 0 | 0 | 0 | Unf | 0 | 1595 | 1595 |
| 2041 | NA | Mn | Gd | 1 | 0 | 1044 | Rec | 382 | 0 | 1426 |
| 2121 | None | None | None | NA | NA | NA | None | NA | NA | NA |
| 2186 | NA | No | TA | 0 | 1 | 1033 | Unf | 0 | 94 | 1127 |
| 2189 | None | None | None | NA | NA | 0 | None | 0 | 0 | 0 |
| 2218 | Fa | No | NA | 0 | 0 | 0 | Unf | 0 | 173 | 173 |
| 2219 | TA | No | NA | 0 | 0 | 0 | Unf | 0 | 356 | 356 |
| 2349 | TA | NA | Gd | 0 | 0 | 0 | Unf | 0 | 725 | 725 |
| 2525 | NA | Av | TA | 0 | 0 | 755 | Unf | 0 | 240 | 995 |
The output above indicates,
df$BsmtFinType2[333] <- names(sort(table(df$BsmtFinType2), decreasing=TRUE))[1]
df$BsmtExposure[c(949,1488,2349)] <- names(sort(table(df$BsmtExposure), decreasing=TRUE))[1]
df$BsmtCond[c(2041,2186,2525)] <- names(sort(table(df$BsmtCond), decreasing=TRUE))[1]
df$BsmtFullBath[c(2121,2189)] <- 0
df$BsmtHalfBath[c(2121,2189)] <- 0
df[2121, c("BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF")] <- 0
df$BsmtQual[c(2218,2219)] <- names(sort(table(df$BsmtQual), decreasing=TRUE))[1]Test if the NA’s are handled
kable(df[c(333,949,1488,2041,2121,2186,2189,2218,2219,2349,2525), c("BsmtCond", "BsmtExposure", "BsmtQual", "BsmtFullBath", "BsmtHalfBath","BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF")])| BsmtCond | BsmtExposure | BsmtQual | BsmtFullBath | BsmtHalfBath | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | |
|---|---|---|---|---|---|---|---|---|---|---|
| 333 | TA | No | Gd | 1 | 0 | 1124 | Unf | 479 | 1603 | 3206 |
| 949 | TA | No | Gd | 0 | 0 | 0 | Unf | 0 | 936 | 936 |
| 1488 | TA | No | Gd | 0 | 0 | 0 | Unf | 0 | 1595 | 1595 |
| 2041 | TA | Mn | Gd | 1 | 0 | 1044 | Rec | 382 | 0 | 1426 |
| 2121 | None | None | None | 0 | 0 | 0 | None | 0 | 0 | 0 |
| 2186 | TA | No | TA | 0 | 1 | 1033 | Unf | 0 | 94 | 1127 |
| 2189 | None | None | None | 0 | 0 | 0 | None | 0 | 0 | 0 |
| 2218 | Fa | No | TA | 0 | 0 | 0 | Unf | 0 | 173 | 173 |
| 2219 | TA | No | TA | 0 | 0 | 0 | Unf | 0 | 356 | 356 |
| 2349 | TA | No | Gd | 0 | 0 | 0 | Unf | 0 | 725 | 725 |
| 2525 | TA | Av | TA | 0 | 0 | 755 | Unf | 0 | 240 | 995 |
There are 5 character Basement variables, 3 of which are ordered (BsmtQual, BsmtCond, BsmtExposure) and 2 of which are unordered (BsmtFinType1, BsmtFinType2). I will convert the unordered variables to factors and assign numeric values to ordered varaibles and convert them to numeric types.
cbind(sapply(df[basement_vars], class)) [,1]
BsmtQual "character"
BsmtCond "character"
BsmtExposure "character"
BsmtFinType1 "character"
BsmtFinSF1 "numeric"
BsmtFinType2 "character"
BsmtFinSF2 "numeric"
BsmtUnfSF "numeric"
TotalBsmtSF "numeric"
BsmtFullBath "numeric"
BsmtHalfBath "numeric"
table(df$BsmtQual)
Ex Fa Gd None TA
258 88 1209 79 1285
table(df$BsmtCond)
Fa Gd None Po TA
104 122 79 5 2609
table(df$BsmtExposure)
Av Gd Mn No None
418 276 239 1907 79
table(df$BsmtFinType1)
ALQ BLQ GLQ LwQ None Rec Unf
429 269 849 154 79 288 851
table(df$BsmtFinType2)
ALQ BLQ GLQ LwQ None Rec Unf
52 68 34 87 79 105 2494
df$BsmtFinType1 <- as.factor(df$BsmtFinType1)
df$BsmtFinType2 <- as.factor(df$BsmtFinType2)
df$BsmtQual <- as.integer(revalue(df$BsmtQual, c("None"=0, "Fa"=1, "TA"=2, "Gd"=3, "Ex"=4)))
df$BsmtCond <- as.integer(revalue(df$BsmtCond, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4)))
df$BsmtExposure <- as.integer(revalue(df$BsmtExposure, c("None"=0, "No"=1, "Mn"=2, "Av"=3, "Gd"=4)))
table(df$BsmtQual)
0 1 2 3 4
79 88 1285 1209 258
table(df$BsmtCond)
0 1 2 3 4
79 5 104 2609 122
table(df$BsmtExposure)
0 1 2 3 4
79 1907 239 418 276
table(df$BsmtFinType1)
ALQ BLQ GLQ LwQ None Rec Unf
429 269 849 154 79 288 851
table(df$BsmtFinType2)
ALQ BLQ GLQ LwQ None Rec Unf
52 68 34 87 79 105 2494
There are 2 pool variables and only variable PoolQC have 2909 NA’s.
df_NA[c(which(names(df_NA) %like% pool_vars))]PoolQC
2909
For variable PoolQC, based on the description I am going to replace NA’s with None if the PoolArea is 0. It appears that there are 2906 PoolQA NA’s since there are 2906 PoolArea values with 0.
df$PoolQC[df$PoolArea==0] <- 'None'
#Show the missing value counts with their column names
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Pool%"))]PoolQC
3
df[(is.na(df$PoolQC)), c("PoolQC", "PoolArea")]We need to find the 3 missing PoolQC NA’s and their respective PoolArea values. Rows 2421, 2504, 2600 have PoolArea values but the PoolQC is missing. The NA values will be imputed with the mode
df$PoolQC[c(2421,2504,2600)] <- names(sort(table(df$PoolQC), decreasing=TRUE))[2]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Pool%"))]named numeric(0)
df$PoolQC[c(2421,2504,2600)][1] "Ex" "Ex" "Ex"
There is 1 Pool ordinal character variable and I will assign numeric values and convert it to integer type.
cbind(sapply(df[pool_vars], class)) [,1]
PoolArea "integer"
PoolQC "character"
table(df$PoolQC)
Ex Fa Gd None
7 2 4 2906
df$PoolQC <- as.integer(revalue(df$PoolQC, c("None"=0, "Fa"=1, "Gd"=2, "Ex"=3)))
table(df$PoolQC)
0 1 2 3
2906 2 4 7
There are 5 porch variables and no NA’s
WoodDeckSF: Wood deck area in square feet
OpenPorchSF: Open porch area in square feet
EnclosedPorch: Enclosed porch area in square feet
3SsnPorch: Three season porch area in square feet
ScreenPorch: Screen porch area in square feet
There are 5 integer porch variables and no NA’s.
df_NA[c(which(names(df_NA) %like% porch_vars))]named numeric(0)
cbind(sapply(df[porch_vars], class)) [,1]
WoodDeckSF "integer"
OpenPorchSF "integer"
EnclosedPorch "integer"
X3SsnPorch "integer"
ScreenPorch "integer"
There are 4 sale variables out of which SalePrice and SaleType have NA’s. The 1459 NA’s in SalePrice are due to the addition of missing ‘SalePrice’ column with NA’s assigned in test data that we need to predict.
df_NA[c(which(names(df_NA) %like% sale_vars))]SalePrice SaleType
1459 1
For SaleType ‘NA’ values, I will impute with the mode value. There are 2 character variables, ‘SaleType’ and ‘SaleCondition’. Both the character variables are unordered and will convert them to factors
df$SaleType[is.na(df$SaleType)] <- names(sort(table(df$SaleType), decreasing=TRUE))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Sale%"))]SalePrice
1459
cbind(sapply(df[sale_vars], class)) [,1]
MoSold "integer"
YrSold "integer"
SaleType "character"
SaleCondition "character"
SalePrice "integer"
table(df$SaleType)
COD Con ConLD ConLI ConLw CWD New Oth WD
87 5 26 9 8 12 239 7 2526
table(df$SaleCondition)
Abnorml AdjLand Alloca Family Normal Partial
190 12 24 46 2402 245
df$SaleType <- as.factor(df$SaleType)
df$SaleCondition <- as.factor(df$SaleCondition)There are 6 variables related to Lot and Land. Variable ‘LotFrontage’ has 486 NA’s.
df_NA[c(which(names(df_NA) %like% lot_vars))]LotFrontage
486
cbind(sapply(df[lot_vars], class)) [,1]
LotFrontage "integer"
LotArea "integer"
LotShape "character"
LandContour "character"
LotConfig "character"
LandSlope "character"
The NA’s for the LotFrontage will be imputed with median value per Neighborhood variable since the values will be within bounds, while imputing with median can make the data look normally distributed.
lot_agg_median <- aggregate(list("LotFrontage_median"=df$LotFrontage), by=list("Neighborhood"=df$Neighborhood), FUN=median, na.rm=TRUE)
lot_agg_medianbarplot(lot_agg_median$LotFrontage, names.arg=lot_agg_median$Neighborhood, main="Median LotFrontage by Neighborhood", xlab="Neighborhoods", ylab="Median Lot Frontage", col="blue")for (i in 1:nrow(df)){
if(is.na(df$LotFrontage[i])){
df$LotFrontage[i] <- as.integer(median(df$LotFrontage[df$Neighborhood==df$Neighborhood[i]], na.rm=TRUE))
}
}
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% "%Lot%"))]named numeric(0)
There are 4 character variables. LotConfig and LandContour variables look like unordered varaibles and will convert them to factors. LotShape and LandSlope have ordinal values hence will assign numeric values and convert them to numeric variables
table(df$LotShape)
IR1 IR2 IR3 Reg
968 76 16 1859
table(df$LandContour)
Bnk HLS Low Lvl
117 120 60 2622
table(df$LotConfig)
Corner CulDSac FR2 FR3 Inside
511 176 85 14 2133
table(df$LandSlope)
Gtl Mod Sev
2778 125 16
df$LotConfig <- as.factor(df$LotConfig)
df$LandContour <- as.factor(df$LandContour)
df$LotShape <- as.numeric(revalue(df$LotShape, c('IR3'=0, 'IR2'=1, 'IR1'=2, 'Reg'=3)))
df$LandSlope <- as.numeric(revalue(df$LandSlope, c('Sev'=0, 'Mod'=1, 'Gtl'=2)))
table(df$LotShape)
0 1 2 3
16 76 968 1859
table(df$LandContour)
Bnk HLS Low Lvl
117 120 60 2622
table(df$LotConfig)
Corner CulDSac FR2 FR3 Inside
511 176 85 14 2133
table(df$LandSlope)
0 1 2
16 125 2778
There are 7 Dwelling varaibles with no NA’s. There are 2 character variables that are unordered hence I will convert them to factors. ‘MSSubClass’ is an integer variable that should be converted into a factor. Based on the description, it identifies the type of dwelling involved in the sale. It is coded numerically and is an unordered variable.
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% dwelling_vars))]named numeric(0)
cbind(sapply(df[dwelling_vars], class)) [,1]
MSSubClass "integer"
BldgType "character"
HouseStyle "character"
OverallQual "integer"
OverallCond "integer"
YearBuilt "integer"
YearRemodAdd "integer"
table(df$BldgType)
1Fam 2fmCon Duplex Twnhs TwnhsE
2425 62 109 96 227
table(df$HouseStyle)
1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer SLvl
314 19 1471 8 24 872 83 128
df$BldgType <- as.factor(df$BldgType)
df$HouseStyle <- as.factor(df$HouseStyle)
df$MSSubClass <- as.factor(as.character(df$MSSubClass))
cbind(sapply(df[dwelling_vars], class)) [,1]
MSSubClass "factor"
BldgType "factor"
HouseStyle "factor"
OverallQual "integer"
OverallCond "integer"
YearBuilt "integer"
YearRemodAdd "integer"
There are 13 exterior variables out of which 12 are character variables
# NA's for the exterior variables
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% exterior_vars))] Alley Fence MasVnrType MasVnrArea Exterior1st Exterior2nd
2721 2348 24 23 1 1
cbind(sapply(df[exterior_vars], class)) [,1]
Street "character"
Alley "character"
RoofStyle "character"
RoofMatl "character"
Exterior1st "character"
Exterior2nd "character"
MasVnrType "character"
MasVnrArea "integer"
ExterQual "character"
ExterCond "character"
Foundation "character"
PavedDrive "character"
Fence "character"
Fixing ‘Alley’ and ‘Fence’ variables
df$Alley[is.na(df$Alley)] <- 'None'
df$Fence[is.na(df$Fence)] <- 'None'
df[is.na(df$MasVnrType), c("MasVnrType", "MasVnrArea")]It is evident that row 2611 has MasVnrArea and a missing MasVnrType. I am going to impute the ‘NA’ with a mode (excluding ‘None’ if it is the most frequent).
df$MasVnrType[is.na(df$MasVnrArea)] <- 'None'
df$MasVnrArea[is.na(df$MasVnrArea)] <- 0
df[2611, "MasVnrType"] <- names(sort(table(df$MasVnrType), decreasing=TRUE))[2]
df[2611, c("MasVnrType", "MasVnrArea")]df[which(is.na(df$Exterior1st) | is.na(df$Exterior2nd)), c("Exterior1st", "Exterior2nd")]df$Exterior1st[is.na(df$Exterior1st)] <- names(sort(-table(df$Exterior1st)))[1]
df$Exterior2nd[is.na(df$Exterior2nd)] <- names(sort(-table(df$Exterior2nd)))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% c("%Exter%", "%Roof%", "%MasVnr%", "%Foundation%", "%Street%", "%Alley%", "%PavedDrive%", "%Fence%")))]named numeric(0)
Variables ‘Street’, ‘Alley’, ‘RoofStyle’, ‘RoofMatl’, ‘Exterior1st’, ‘Exterior2nd’, ‘MasVnrType’, ‘Foundation’ and ‘PavedDrive’, “Fence” are unordered variables and will convert them to factors. Variables ‘ExterQual’, ‘ExterCond’ are ordered variables and assign numeric values to them an convert them to integer variables.
df[,c("Street", "Alley", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "PavedDrive", "Fence")] <- lapply(df[,c("Street", "Alley", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "PavedDrive", "Fence")], factor)
df$ExterQual <- as.integer(revalue(df$ExterQual, c("Fa"=0, "TA"=1, "Gd"=2, "Ex"=3)))
df$ExterCond <- as.integer(revalue(df$ExterCond, c("Po"=0, "Fa"=1, "TA"=2, "Gd"=3, "Ex"=4)))
table(df$ExterQual)
0 1 2 3
35 1798 979 107
table(df$ExterCond)
0 1 2 3 4
3 67 2538 299 12
table(df$Foundation)
BrkTil CBlock PConc Slab Stone Wood
311 1235 1308 49 11 5
table(df$PavedDrive)
N P Y
216 62 2641
table(df$Fence)
GdPrv GdWo MnPrv MnWw None
118 112 329 12 2348
cbind(sapply(df[exterior_vars], class)) [,1]
Street "factor"
Alley "factor"
RoofStyle "factor"
RoofMatl "factor"
Exterior1st "factor"
Exterior2nd "factor"
MasVnrType "factor"
MasVnrArea "numeric"
ExterQual "integer"
ExterCond "integer"
Foundation "factor"
PavedDrive "factor"
Fence "factor"
There are 5 utilities variables. Variable ‘Utilities’ has 2 NAs and ‘Electrical’ has 1 NA. All are character variables
cbind(sapply(df[utility_vars], class)) [,1]
Utilities "character"
Heating "character"
HeatingQC "character"
CentralAir "character"
Electrical "character"
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% utility_vars))] Utilities Electrical
2 1
df[is.na(df$Utilities) | is.na(df$Electrical), c("Utilities", "Electrical")]Fixing the variables with the mode values.
df[1380, "Electrical"] <- names(sort(table(df$Electrical), decreasing=TRUE))[1]
df[c(1916, 1946), c("Utilities")] <- names(sort(table(df$Utilities), decreasing=TRUE))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% utility_vars))]named numeric(0)
table(df$Utilities)
AllPub NoSeWa
2918 1
table(df$Heating)
Floor GasA GasW Grav OthW Wall
1 2874 27 9 2 6
table(df$HeatingQC)
Ex Fa Gd Po TA
1493 92 474 3 857
table(df$CentralAir)
N Y
196 2723
table(df$Electrical)
FuseA FuseF FuseP Mix SBrkr
188 50 8 1 2672
Variables ‘Utilities’ and ‘Heating’ and ‘Electrical’ are unordered and hence will convert them to factors. Variables ‘HeatingQC’ and ‘CentralAir’ are ordered and will assigned numeric values and convert them to numeric type.
df$Utilities <- as.factor(df$Utilities)
df$Heating <- as.factor(df$Heating)
df$Electrical <- as.factor(df$Electrical)
df$HeatingQC <- as.integer(revalue(df$HeatingQC, c("Po"=0, "Fa"=1, "TA"=2, "Gd"=3, "Ex"=4)))
df$CentralAir <- as.integer(revalue(df$CentralAir, c("N"=0, "Y"=1)))
table(df$Utilities)
AllPub NoSeWa
2918 1
table(df$Heating)
Floor GasA GasW Grav OthW Wall
1 2874 27 9 2 6
table(df$HeatingQC)
0 1 2 3 4
3 92 857 474 1493
table(df$CentralAir)
0 1
196 2723
table(df$Electrical)
FuseA FuseF FuseP Mix SBrkr
188 50 8 1 2672
cbind(sapply(df[utility_vars], class)) [,1]
Utilities "factor"
Heating "factor"
HeatingQC "integer"
CentralAir "integer"
Electrical "factor"
cbind(sapply(df[interior_vars], class)) [,1]
X1stFlrSF "integer"
X2ndFlrSF "integer"
LowQualFinSF "integer"
GrLivArea "integer"
FullBath "integer"
HalfBath "integer"
BedroomAbvGr "integer"
KitchenAbvGr "integer"
KitchenQual "character"
TotRmsAbvGrd "integer"
Functional "character"
Fireplaces "integer"
FireplaceQu "character"
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% interior_vars))]FireplaceQu Functional KitchenQual
1420 2 1
Handling missing data
df$FireplaceQu[is.na(df$FireplaceQu)] <- 'None'
df[is.na(df$Functional) | is.na(df$KitchenQual), c("Functional", "KitchenQual")] df$KitchenQual[is.na(df$KitchenQual)] <- names(sort(table(df$KitchenQual), decreasing=TRUE))[1]
df$Functional[is.na(df$Functional)] <- names(sort(table(df$Functional), decreasing=TRUE))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% interior_vars))]named numeric(0)
table(df$FireplaceQu)
Ex Fa Gd None Po TA
43 74 744 1420 46 592
table(df$Functional)
Maj1 Maj2 Min1 Min2 Mod Sev Typ
19 9 65 70 35 2 2719
table(df$KitchenQual)
Ex Fa Gd TA
205 70 1151 1493
All the three character variables are ordinal and will assign integer values and convert them to integer type variables.
df$FireplaceQu <- as.integer(revalue(df$FireplaceQu, c("None"=0, "Po"=1, "Fa"=2, "TA"=3, "Gd"=4, "Ex"=5)))
df$Functional <- as.integer(revalue(df$Functional, c("Sal"=0, "Sev"=1, "Maj2"=2, "Maj1"=3, "Mod"=4, "Min2"=5, "Min1"=6, "Typ"=7)))The following `from` values were not present in `x`: Sal
df$KitchenQual <- as.integer(revalue(df$KitchenQual, c("Fa"=0, "TA"=1, "Gd"=2, "Ex"=3)))
table(df$FireplaceQu)
0 1 2 3 4 5
1420 46 74 592 744 43
table(df$Functional)
1 2 3 4 5 6 7
2 9 19 35 70 65 2719
table(df$KitchenQual)
0 1 2 3
70 1493 1151 205
There are 2 varaibles and ‘MiscFeature’ has 2814 NA’s
cbind(sapply(df[misc_vars], class)) [,1]
MiscFeature "character"
MiscVal "integer"
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% c("%Misc%")))]MiscFeature
2814
I will assign ‘None’ to all the NA’s in this variable and convert it to a factor since there is no order and it is a multinomial variable.
df$MiscFeature[is.na(df$MiscFeature)] <- "None"
table(df$MiscFeature)
Gar2 None Othr Shed TenC
5 2814 4 95 1
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% c("%Misc%")))]named numeric(0)
df$MiscFeature <- as.factor(df$MiscFeature)
cbind(sapply(df[misc_vars], class)) [,1]
MiscFeature "factor"
MiscVal "integer"
MSZoning: Identifies the general zoning classification of the sale.
A Agriculture
C Commercial
FV Floating Village Residential
I Industrial
RH Residential High Density
RL Residential Low Density
RP Residential Low Density Park
RM Residential Medium Density
cbind(sapply(df[zoning_vars], class)) [,1]
MSZoning "character"
table(df[zoning_vars])
C (all) FV RH RL RM
25 139 26 2265 460
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% zoning_vars))]MSZoning
4
There are 4 NA in ‘MSZoning’ variable and I will impute it with mode. The variable is not ordinal and hence will convert it to a factor.
df$MSZoning[is.na(df$MSZoning)] <- names(sort(table(df$MSZoning), decreasing=TRUE))[1]
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% zoning_vars))]named numeric(0)
df$MSZoning <- as.factor(df$MSZoning)
cbind(sapply(df[zoning_vars], class)) [,1]
MSZoning "factor"
There are 3 varaibles with no NA’s
cbind(sapply(df[neighborhood_vars], class)) [,1]
Neighborhood "character"
Condition1 "character"
Condition2 "character"
NAcol <- which(colSums(is.na(df)) > 0)
df_NA <- sort(colSums(sapply(df[NAcol], is.na)), decreasing = TRUE)
df_NA[c(which(names(df_NA) %like% neighborhood_vars))]named numeric(0)
table(df$Neighborhood)
Blmngtn Blueste BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert
28 10 30 108 44 267 103 194 165
IDOTRR MeadowV Mitchel NAmes NoRidge NPkVill NridgHt NWAmes OldTown
93 37 114 443 71 23 166 131 239
Sawyer SawyerW Somerst StoneBr SWISU Timber Veenker
151 125 182 51 48 72 24
table(df$Condition1)
Artery Feedr Norm PosA PosN RRAe RRAn RRNe RRNn
92 164 2511 20 39 28 50 6 9
table(df$Condition2)
Artery Feedr Norm PosA PosN RRAe RRAn RRNn
5 13 2889 4 4 1 1 2
All the three variables are unordered and will convert them to factors
df$Neighborhood <- as.factor(df$Neighborhood)
df$Condition1 <- as.factor(df$Condition1)
df$Condition2 <- as.factor(df$Condition2)
cbind(sapply(df[neighborhood_vars], class)) [,1]
Neighborhood "factor"
Condition1 "factor"
Condition2 "factor"
Now that the variable handling is done, I will explore the data further to understand the relationship between variables and their importance as below -
### Group the factor and integer variables
numeric_cols <- unlist(sapply(df, is.numeric))
numeric_col_names <- names(df[,numeric_cols])
factor_cols <- unlist(sapply(df, is.factor))
names(df[,factor_cols])
[1] "MSSubClass" "MSZoning" "Street" "Alley"
[5] "LandContour" "Utilities" "LotConfig" "Neighborhood"
[9] "Condition1" "Condition2" "BldgType" "HouseStyle"
[13] "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
[17] "MasVnrType" "Foundation" "BsmtFinType1" "BsmtFinType2"
[21] "Heating" "Electrical" "GarageType" "PavedDrive"
[25] "Fence" "MiscFeature" "SaleType" "SaleCondition"
#Find linear combinations and eliminate the combination variables. Do not include SalePrice variable.
findLinearCombos(df[,numeric_col_names[-52]])
$linearCombos
$linearCombos[[1]]
[1] 18 15 16 17
$linearCombos[[2]]
[1] 24 21 22 23
$remove
[1] 18 24
### Skew and Kurtosis of integer variables prior to any transformations, feature engineering or one-hot encoding
as.data.frame(psych::describe(df[,numeric_cols]))[, c("mean", "median", "sd", "skew", "kurtosis")]findLinearCombos() identified that 1. TotalBsmtSF is a cumlative sum of “BsmtFinSF1” “BsmtFinSF2” “BsmtUnfSF” 2. “GrLivArea” is a cumlative sum of “X1stFlrSF” “X2ndFlrSF” “LowQualFinSF”
### Get a correlation of int variables
#### Plot the correlations of all numeric variables ####
df_corr <- cor(df[,numeric_cols], use="pairwise.complete.obs") #correlations of all numeric variables
#sort on decreasing correlations with SalePrice
df_corr_sorted <- as.matrix(sort(df_corr[,'SalePrice'], decreasing = TRUE))
#select only high corelations with Sales Price
df_high_corr <- names(which(apply(df_corr_sorted, 1, function(x) abs(x)>0.4)))
df_corr_matrix <- df_corr[df_high_corr, df_high_corr]
#corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt", upper="circle", lower="number")
corrplot(df_corr_matrix, method="number", tl.col="black", tl.srt=45)The correlation matrix above indicates the variables that are highly correlated with SalePrice. There is also a high correlation among some predictor variables. Inorder to reduce the effects of multi-collinearity we need to eliminate predictor variables that are highly correlated with other predictor variables. Before eliminating the variables I am going to run a Random Forests model for a test of significance and important variables.
set.seed(100)
#check variable importance using random forests
#rf_ranges <- list(ntree=c(500), mtry=5:30)
#rf_tune <- tune(randomForest, SalePrice ~ ., data=df[1:1460,], ranges=rf_ranges)
#rf_tune$best.parameters
#rf_best <- rf_tune$best.model
#rf_best
df_rf <- randomForest(x=df[1:1460, -80], y=df$SalePrice[1:1460], ntree=500, importance=TRUE)
varImpPlot(df_rf)The left plot above indicates the increase in MSE if a variable is taken out of the model. The right plot indicates the node purity, which is a loss function using which the splits are chosen. The loss function is MSE for regression and GINI-impurity for classification.
Now, I will proceed with feature engineering and feature elimination. I will start with each of the variable groups above, run a pair-wise correlation of each group with SalePrice while keeping in view the above correlation matrix and variable importance.
cor(df[,c("SalePrice", "GarageCars", "GarageArea", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond")], use="pairwise.complete.obs") SalePrice GarageCars GarageArea GarageYrBlt GarageFinish
SalePrice 1.0000000 0.6404092 0.6234314 0.2613664 0.5492468
GarageCars 0.6404092 1.0000000 0.8898902 0.5806926 0.5771599
GarageArea 0.6234314 0.8898902 1.0000000 0.5502094 0.5120565
GarageYrBlt 0.2613664 0.5806926 0.5502094 1.0000000 0.4830923
GarageFinish 0.5492468 0.5771599 0.5120565 0.4830923 1.0000000
GarageQual 0.2738391 0.5666054 0.5532910 0.9441342 0.4864983
GarageCond 0.2631908 0.5562588 0.5402007 0.9493199 0.4799565
GarageQual GarageCond
SalePrice 0.2738391 0.2631908
GarageCars 0.5666054 0.5562588
GarageArea 0.5532910 0.5402007
GarageYrBlt 0.9441342 0.9493199
GarageFinish 0.4864983 0.4799565
GarageQual 1.0000000 0.9463743
GarageCond 0.9463743 1.0000000
The count of Garage Cars is directly proportional to the Garage Area. Garage cars has a high correlation with SalePrice than GarageArea. Hence we drop the variable GarageArea. Similarly we drop GarageFinish. For now I am going to create list of variables that will be dropped and append to this list as I identify more.
drop_vars <- list("GarageArea", "GarageFinish")Predictor variable TotalBsmtSF is the cumulative sum of predictor variables BsmtFinSF1,BsmtFinSF2, BsmtUnfSF Correlation between TotalBsmtSF and the other Bsmt SquareFoot variable is 1. Hence dropping the variables BsmtFinSF1,BsmtFinSF2, BsmtUnfSF.
cor(df$TotalBsmtSF, (df$BsmtFinSF1+df$BsmtFinSF2+df$BsmtUnfSF))
[1] 1
cor(df[,c("SalePrice", "BsmtQual", "BsmtCond", "BsmtExposure", "TotalBsmtSF", "BsmtFullBath")], use="pairwise.complete.obs")
SalePrice BsmtQual BsmtCond BsmtExposure TotalBsmtSF
SalePrice 1.0000000 0.6229247 0.2126072 0.3750450 0.6135806
BsmtQual 0.6229247 1.0000000 0.5359296 0.4386610 0.5653541
BsmtCond 0.2126072 0.5359296 1.0000000 0.2701941 0.3854455
BsmtExposure 0.3750450 0.4386610 0.2701941 1.0000000 0.4081805
TotalBsmtSF 0.6135806 0.5653541 0.3854455 0.4081805 1.0000000
BsmtFullBath 0.2271222 0.2614480 0.1708422 0.3381548 0.3262518
BsmtFullBath
SalePrice 0.2271222
BsmtQual 0.2614480
BsmtCond 0.1708422
BsmtExposure 0.3381548
TotalBsmtSF 0.3262518
BsmtFullBath 1.0000000
drop_vars <- list.append(drop_vars, "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF")Decks are usually open and porches either have a roof or are screened.
Hence, I am going to consolidate all the porch variables and delete “OpenPorchSF”, “EnclosedPorch”, “X3SsnPorch”, “ScreenPorch” variables.
df$TotalPorchSF <- df$OpenPorchSF + df$EnclosedPorch + df$X3SsnPorch + df$ScreenPorch
cor(df[,c("SalePrice", "TotalPorchSF", "WoodDeckSF")], use="pairwise.complete.obs")
SalePrice TotalPorchSF WoodDeckSF
SalePrice 1.0000000 0.19573894 0.32441344
TotalPorchSF 0.1957389 1.00000000 -0.07489956
WoodDeckSF 0.3244134 -0.07489956 1.00000000
plot(df$TotalPorchSF, df$SalePrice)drop_vars <- list.append(drop_vars, "OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch" )
head(df$SalePrice)
[1] 208500 181500 223500 140000 250000 143000Adding all the porch variables did not seem to increase the correlation with SalePrice
GrLivArea is a cumulative sum of X1stFlrSF, X2ndFlrSF, LowQualFinSF and have a correlation of 1 with GrLivArea. Dropping X1stFlrSF, X2ndFlrSF, LowQualFinSF inorder to reduce multi-collinearity effects.
I am also going to consolidate the variables GrLivArea and TotalBsmtSF into TotalSF and drop the two variables
cor(df$GrLivArea, (df$X1stFlrSF+df$X2ndFlrSF+df$LowQualFinSF))
[1] 1
df$TotalSF <- df$GrLivArea + df$TotalBsmtSF
cor(df[,c("SalePrice", "GrLivArea", "TotalBsmtSF", "TotalSF")], use="pairwise.complete.obs")
SalePrice GrLivArea TotalBsmtSF TotalSF
SalePrice 1.0000000 0.7086245 0.6135806 0.7789588
GrLivArea 0.7086245 1.0000000 0.4457098 0.8717698
TotalBsmtSF 0.6135806 0.4457098 1.0000000 0.8271178
TotalSF 0.7789588 0.8717698 0.8271178 1.0000000
drop_vars <- list.append(drop_vars, "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF", "GrLivArea", "TotalBsmtSF")I will not touch the Quality variables since buyers depend on each quality variable first before being satisfied with the overall quality of the house.
Next, I am going to consolidate all the bathroom variables since buyers are interested in total bathroom count. I will multiply the half baths with .05 inorder to sum it up correctly. Below correlation clearly indicates that TotalBathRooms variable is a strong predictor than individual predictors. Thus, dropping all the Bath predictor variables and retaining the TotalBathRooms variable.
df$TotalBathRooms <- df$BsmtFullBath + (df$BsmtHalfBath * 0.5) + df$FullBath + (df$HalfBath * 0.5)
cor(df[,c("SalePrice", "TotalBathRooms", "BsmtFullBath", "BsmtHalfBath", "FullBath" , "HalfBath")], use="pairwise.complete.obs") SalePrice TotalBathRooms BsmtFullBath BsmtHalfBath
SalePrice 1.00000000 0.63173107 0.22712223 -0.01684415
TotalBathRooms 0.63173107 1.00000000 0.60252510 0.00501479
BsmtFullBath 0.22712223 0.60252510 1.00000000 -0.14865481
BsmtHalfBath -0.01684415 0.00501479 -0.14865481 1.00000000
FullBath 0.56066376 0.71326860 -0.01922449 -0.04711656
HalfBath 0.28410768 0.38848184 -0.03358243 -0.05845680
FullBath HalfBath
SalePrice 0.56066376 0.28410768
TotalBathRooms 0.71326860 0.38848184
BsmtFullBath -0.01922449 -0.03358243
BsmtHalfBath -0.04711656 -0.05845680
FullBath 1.00000000 0.15838609
HalfBath 0.15838609 1.00000000
drop_vars <- list.append(drop_vars, "BsmtFullBath", "BsmtHalfBath", "FullBath" , "HalfBath")I am going to take a look at the Room predictor variables. TotRmsAbvGrd is highly correlated with SalePrice, however, above correlation matrix indicates a high correlation with GrLivArea. Since GrLivArea is highly correlated with SalePrice than TotRmsAbvGrd is with SalePrice, we will drop TotRmsAbvGrd variable.
cor(df[,c("SalePrice", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd")], use="pairwise.complete.obs") SalePrice BedroomAbvGr KitchenAbvGr TotRmsAbvGrd
SalePrice 1.0000000 0.1682132 -0.1359074 0.5337232
BedroomAbvGr 0.1682132 1.0000000 0.2411822 0.6697372
KitchenAbvGr -0.1359074 0.2411822 1.0000000 0.2945694
TotRmsAbvGrd 0.5337232 0.6697372 0.2945694 1.0000000
drop_vars <- list.append(drop_vars, "TotRmsAbvGrd")Since buyers are interested in the age of the home. I am going to created a new variable ‘Age’ and calculate based on the YearBuilt, YrSold and MonthSold variables. I also converted month into year unit by dividing by 12. A plot of Age and SalePrice indicate a negative correlation. Dropping variables “MoSold”, “YrSold”, because they have a low positive correlation with “SalePrice”. Dropping “YearBuilt” since it has a high correlation with SalePrice and Age variables.
df$Age <- round(df$YrSold+df$MoSold/12 - df$YearBuilt,2)
cor(df[, c("SalePrice", "Age", "MoSold", "YrSold", "YearBuilt")],use="pairwise.complete.obs")
SalePrice Age MoSold YrSold YearBuilt
SalePrice 1.00000000 -0.52306381 0.04643225 -0.02892259 0.52289733
Age -0.52306381 1.00000000 -0.01312782 0.05453331 -0.99908182
MoSold 0.04643225 -0.01312782 1.00000000 -0.15389512 0.01393760
YrSold -0.02892259 0.05453331 -0.15389512 1.00000000 -0.01234419
YearBuilt 0.52289733 -0.99908182 0.01393760 -0.01234419 1.00000000
plot(df$Age, df$SalePrice)hist(df$Age)drop_vars <- list.append(drop_vars, "MoSold", "YrSold", "YearBuilt")Dropping the variables below and getting a list of numeric and character variables
drop_vars <- unlist(drop_vars)
df <- df[!names(df) %in% drop_vars]
numeric_cols <- unlist(sapply(df, is.numeric))
names(df[,numeric_cols]) [1] "LotFrontage" "LotArea" "LotShape" "LandSlope"
[5] "OverallQual" "OverallCond" "YearRemodAdd" "MasVnrArea"
[9] "ExterQual" "ExterCond" "BsmtQual" "BsmtCond"
[13] "BsmtExposure" "HeatingQC" "CentralAir" "BedroomAbvGr"
[17] "KitchenAbvGr" "KitchenQual" "Functional" "Fireplaces"
[21] "FireplaceQu" "GarageYrBlt" "GarageCars" "GarageQual"
[25] "GarageCond" "WoodDeckSF" "PoolArea" "PoolQC"
[29] "MiscVal" "SalePrice" "TotalPorchSF" "TotalSF"
[33] "TotalBathRooms" "Age"
factor_cols <- unlist(sapply(df, is.factor))
names(df[,factor_cols]) [1] "MSSubClass" "MSZoning" "Street" "Alley"
[5] "LandContour" "Utilities" "LotConfig" "Neighborhood"
[9] "Condition1" "Condition2" "BldgType" "HouseStyle"
[13] "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
[17] "MasVnrType" "Foundation" "BsmtFinType1" "BsmtFinType2"
[21] "Heating" "Electrical" "GarageType" "PavedDrive"
[25] "Fence" "MiscFeature" "SaleType" "SaleCondition"
In the next step, I am going to
The numeric columns also contain the ordinal variables. I will ignore the ordinal variables and look at the original numeric variables only.
Excluding the ordinal integer variables (revalued ordinal character variables), I will focus on the following numeric variables -
“LotFrontage”, “LotArea”, “MasVnrArea”, “TotalBsmtSF”, “GrLivArea”, “BedroomAbvGr” “KitchenAbvGr”, “Fireplaces”, “GarageCars”, “WoodDeckSF”, “PoolArea”, “MiscVal”, “SalePrice”, “TotalPorchSF”, “TotalBathRooms”
All the above variables will be transformed using the YeoJohnson method with the exception of SalePrice. YeoJohnson method handles zero’s and negative values. SalePrice will use a logarithmic transformation since submissions are evaluated on Root-Mean-Squared-Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price.
transform_vars <- c("LotFrontage", "LotArea", "MasVnrArea", "TotalSF", "BedroomAbvGr", "KitchenAbvGr", "Fireplaces", "GarageCars", "WoodDeckSF", "PoolArea", "MiscVal", "TotalPorchSF", "TotalBathRooms", "Age")
as.data.frame(psych::describe(df[transform_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
#Exploratory data analysis before transformation
EDA(df[transform_vars])
#Transform variables
preprocessParams <- preProcess(df[transform_vars], method=c("YeoJohnson"))
preprocessParams
Created from 2919 samples and 12 variables
Pre-processing:
- ignored (0)
- Yeo-Johnson transformation (12)
Lambda estimates for Yeo-Johnson transformation:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-1.5269 -0.1262 0.2315 0.1254 0.4265 1.2187
preprocessParams$method
$YeoJohnson
[1] "LotFrontage" "LotArea" "MasVnrArea" "TotalSF"
[5] "BedroomAbvGr" "KitchenAbvGr" "Fireplaces" "GarageCars"
[9] "WoodDeckSF" "TotalPorchSF" "TotalBathRooms" "Age"
$ignore
character(0)
transformed_df <- predict(preprocessParams, df)
hist(df$SalePrice)
transformed_df$SalePrice <- log(df$SalePrice)
#head(transformed_df)
as.data.frame(psych::describe(transformed_df[transform_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]#Exploratory data analysis after transformation
EDA(transformed_df[transform_vars])#Check variable importance post transformation
set.seed(100)
#check variable importance using random forests
#rf_ranges <- list(ntree=c(500), mtry=5:30)
#rf_tune <- tune(randomForest, SalePrice ~ ., data=df[1:1460,], ranges=rf_ranges)
#rf_tune$best.parameters
#rf_best <- rf_tune$best.model
#rf_best
names(transformed_df)
[1] "MSSubClass" "MSZoning" "LotFrontage" "LotArea"
[5] "Street" "Alley" "LotShape" "LandContour"
[9] "Utilities" "LotConfig" "LandSlope" "Neighborhood"
[13] "Condition1" "Condition2" "BldgType" "HouseStyle"
[17] "OverallQual" "OverallCond" "YearRemodAdd" "RoofStyle"
[21] "RoofMatl" "Exterior1st" "Exterior2nd" "MasVnrType"
[25] "MasVnrArea" "ExterQual" "ExterCond" "Foundation"
[29] "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1"
[33] "BsmtFinType2" "Heating" "HeatingQC" "CentralAir"
[37] "Electrical" "BedroomAbvGr" "KitchenAbvGr" "KitchenQual"
[41] "Functional" "Fireplaces" "FireplaceQu" "GarageType"
[45] "GarageYrBlt" "GarageCars" "GarageQual" "GarageCond"
[49] "PavedDrive" "WoodDeckSF" "PoolArea" "PoolQC"
[53] "Fence" "MiscFeature" "MiscVal" "SaleType"
[57] "SaleCondition" "SalePrice" "TotalPorchSF" "TotalSF"
[61] "TotalBathRooms" "Age"
trans_df_rf <- randomForest(x=transformed_df[1:1460, -63], y=transformed_df$SalePrice[1:1460], ntree=500, importance=TRUE)
varImpPlot(trans_df_rf)trans_df_corr <- round(cor(transformed_df[c(transform_vars, "SalePrice")],use="pairwise.complete.obs"),2)
trans_cor_sorted <- as.matrix(sort(trans_df_corr[,'SalePrice'], decreasing = TRUE))
trans_cor_vars <- names(apply(trans_cor_sorted, 1, function(x) abs(x)>0))
corrplot(trans_df_corr[trans_cor_vars,trans_cor_vars], method="number", type="upper", tl.col="black", tl.srt=45)#Scatter plot matrix of variables
pairs(~SalePrice+Age+LotFrontage,data=transformed_df,
main="Simple Scatterplot Matrix")pairs(~SalePrice+LotArea+MasVnrArea,data=transformed_df,
main="Simple Scatterplot Matrix")pairs(~SalePrice+TotalSF+BedroomAbvGr,data=transformed_df,
main="Simple Scatterplot Matrix")pairs(~SalePrice+KitchenAbvGr+Fireplaces+GarageCars,data=transformed_df,
main="Simple Scatterplot Matrix")pairs(~SalePrice+WoodDeckSF+PoolArea,data=transformed_df,
main="Simple Scatterplot Matrix")pairs(~SalePrice+MiscVal+TotalPorchSF+TotalBathRooms,data=transformed_df,
main="Simple Scatterplot Matrix")Post transformation I will continue to work with ‘transfomed_df’ dataset. Now that the numeric variables are transformed. I will encode the nominal variables, in other words the factor variables. Since our dependant variable, SalePrice is an integer and the goal is to predict the SalePrice, we are going to run a regression model for which categorical data for nominal variables must be converted to numeric form. I am going to use the One-Hot encoding.
The nominal variables are as below -
factor_cols <- unlist(sapply(transformed_df, is.factor))
names(transformed_df[,factor_cols])
[1] "MSSubClass" "MSZoning" "Street" "Alley"
[5] "LandContour" "Utilities" "LotConfig" "Neighborhood"
[9] "Condition1" "Condition2" "BldgType" "HouseStyle"
[13] "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
[17] "MasVnrType" "Foundation" "BsmtFinType1" "BsmtFinType2"
[21] "Heating" "Electrical" "GarageType" "PavedDrive"
[25] "Fence" "MiscFeature" "SaleType" "SaleCondition"## Quick summary ##
#List of garage variables after feature engineering
garage_vars <- setdiff(garage_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[garage_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[garage_vars])EDA(transformed_df[garage_vars]) ## Quick summary ##
basement_vars <- setdiff(basement_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[basement_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[basement_vars])EDA(transformed_df[basement_vars]) ## Quick summary ##
pool_vars <- setdiff(pool_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[pool_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[pool_vars])EDA(transformed_df[pool_vars]) ## Quick summary ##
porch_vars <- setdiff(porch_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[porch_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[porch_vars])EDA(transformed_df[porch_vars]) ## Quick summary ##
sale_vars <- setdiff(sale_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[sale_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[sale_vars])EDA(transformed_df[sale_vars])## Quick summary ##
lot_vars <- setdiff(lot_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[lot_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[lot_vars])EDA(transformed_df[lot_vars])## Quick summary ##
dwelling_vars <- setdiff(dwelling_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[dwelling_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[dwelling_vars])EDA(transformed_df[dwelling_vars]) ## Quick summary ##
exterior_vars <- setdiff(exterior_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[exterior_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[exterior_vars])EDA(transformed_df[exterior_vars])## Quick summary ##
utility_vars <- setdiff(utility_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[utility_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[utility_vars])EDA(transformed_df[utility_vars]) ## Quick summary ##
interior_vars <- setdiff(interior_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[interior_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[interior_vars])EDA(transformed_df[interior_vars])mmisc_vars <- setdiff(misc_vars, drop_vars)
## Quick summary ##
as.data.frame(psych::describe(transformed_df[misc_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[misc_vars])EDA(transformed_df[misc_vars])## Quick summary ##
zoning_vars <- setdiff(zoning_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[zoning_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]
Warning in psych::describe(transformed_df[zoning_vars]): You were trying
to describe a non-numeric data.frame or vector which describe converted to
numeric.head(transformed_df[zoning_vars])EDA(transformed_df[zoning_vars])## Quick summary ##
neighborhood_vars <- setdiff(neighborhood_vars, drop_vars)
as.data.frame(psych::describe(transformed_df[neighborhood_vars]))[, c("mean", "median", "sd", "skew", "kurtosis")]head(transformed_df[neighborhood_vars])EDA(transformed_df[neighborhood_vars])##From CARET package
#fit_glm <= glm(transformed_df~., family="")
#varImp(fit_glm)
#A correaltion between the garage variables and
### Get a historgram of factor variables
### Identify most impact variables